#!/usr/bin/env python
#coding:utf-8
"""
  Author:  程勇 --<>
  Purpose: 动脑学院第22节作业
  Created: 2018/3/19
"""

import re
import os

# 1. Use a regular expression to match landline phone numbers of the form
#    0713-xxxxxxxx (a landline number in Hunan province): a 3-4 digit area
#    code, a hyphen, then exactly 8 digits.
# The trailing \b rejects numbers with too many digits after the hyphen; the
# leading (?<!\d) is its counterpart on the left, so the pattern cannot match
# the tail of a longer digit run (e.g. "1230713-11111111").
question_1 = re.compile(r'(?<!\d)(\d{3,4}-\d{8})\b')
r = question_1.findall('0713-5454523 0135-4513245844 0713-11111111 015-88888888')
print(r)

# 2. The area code may be written with parentheses or with a hyphen, and it is
#    optional: the expression must match 800-555-1212, 555-1212 and
#    (800)555-1212.
# Pattern pieces:
#   (?<![\d()-])              the match must not start right after a digit,
#                             parenthesis or hyphen, so fragments of malformed
#                             numbers such as "132)456-7890" or "(456-851-9856"
#                             are not picked up as 7-digit numbers
#   (?:\(\d{3}\)|\d{3}-)?     optional area code, "(800)" or "800-"
#   \d{3}-\d{4}\b             the 7-digit local number, ending on a word boundary
question_2 = re.compile(r'(?<![\d()-])(?:\(\d{3}\)|\d{3}-)?\d{3}-\d{4}\b')
r2 = question_2.findall('800-555-1212 132)456-7890 (456-851-9856 (800)555-1212')
print(r2)

'''3.选作题:实现一个爬虫代码'''
import urllib.request,socket,re,sys,os,html.parser as h

# Root of the site being crawled; relative chapter links are appended to it.
baseUrl = 'http://www.kuwen.net'

# Directory prefix (including the trailing separator) for the output file.
basepath = 'E:\\python_test\\text\\'

# Full path of the output text file; stays empty until getUrl() assigns it.
filepath = ''

# HTTP headers sent with every request.
Webheader = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}


def getContant(Weburl):
    """Fetch ``Weburl`` and return the raw response body.

    The request is sent with the module-level ``Webheader`` headers.

    Args:
        Weburl: URL of the page to download.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: if the request fails (``HTTPError`` for HTTP
            error statuses).
    """
    req = urllib.request.Request(url=Weburl, headers=Webheader)
    # The context manager closes the response even when read() raises, which
    # the previous explicit close() after read() did not guarantee.
    with urllib.request.urlopen(req) as response:
        return response.read()

def getUrl(URL):
    """Download the index page at ``URL`` and collect its chapter links.

    Side effect: sets the module-level ``filepath`` to
    ``basepath + <page title> + ".txt"``, where the title is the text of the
    first ``<h1>`` element on the page. ``getText`` appends to that file.

    Args:
        URL: address of the index page.

    Returns:
        A list of absolute chapter URLs (``baseUrl`` + each relative link of
        the form ``/lyd40275/NNNNNNN.html``), in page order.

    Raises:
        IndexError: if the page contains no ``<h1>...</h1>`` title.
        UnicodeDecodeError: if the page is not valid UTF-8.
    """
    global filepath
    # Decode once and run both regexes over the text. The link regex used to
    # run over str(bytes), i.e. the "b'...'" repr of the raw body, which only
    # matched because the pattern happens to be pure ASCII.
    page = getContant(URL).decode()

    titles = re.findall(r'<h1>(.*?)</h1>', page)
    if not titles:
        raise IndexError('no <h1> title found in index page {!r}'.format(URL))

    # The title comes from a remote page: replace path separators and the
    # other characters Windows forbids in file names, so the title can neither
    # escape basepath nor make open() fail later.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', titles[0])
    filepath = basepath + safe_title + ".txt"

    chapter_link = re.compile(r'<a style="" href="(/lyd40275/\d{7}\.html)">')
    return [baseUrl + link for link in chapter_link.findall(page)]

def openUrl(url):
    """Download one chapter page and hand its decoded HTML to ``getText``.

    Args:
        url: absolute URL of the chapter page.

    Raises:
        UnicodeDecodeError: if the page is not valid UTF-8.
    """
    # Reuse getContant instead of repeating the request code here: it sends
    # the same headers and, unlike the previous inline urlopen() call, closes
    # the response.
    getText(getContant(url).decode())

def getText(data):
    """Extract one chapter from page HTML and append it to the output file.

    The chapter heading is taken from ``<h1>NNtitle</h1>`` (a 1-2 digit
    number followed by the title). The body is the first run of text that
    starts after four ``&nbsp;`` entities and ends at a carriage return;
    paragraph breaks (``<br/><br/>`` plus four ``&nbsp;``) become a blank line
    followed by two ideographic spaces (U+3000) as indentation.

    The chapter is appended, UTF-8 encoded, to the module-level ``filepath``,
    and a confirmation line is printed.

    Args:
        data: HTML of the chapter page (converted with ``str()`` if it is not
            already a string).

    Raises:
        IndexError: if the heading or the body text cannot be found. Both are
            checked before the file is opened, so a page that does not match
            no longer creates or touches the output file.
    """
    page = str(data)

    headings = re.findall(r'<h1>(\d{1,2})(.*?)</h1>', page)
    if not headings:
        raise IndexError('no "<h1>NNtitle</h1>" chapter heading found in page')
    number, name = headings[0]

    bodies = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)\r', page)
    if not bodies:
        raise IndexError('no chapter text found for chapter ' + number)
    body = re.sub(r'<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;', '\n\n\u3000\u3000', bodies[0])

    with open(filepath, 'a', encoding='utf-8') as f:
        # Heading line reads "第<number>章  <title>", i.e. "Chapter <number>".
        f.write('第' + number + '章\u3000\u3000' + name + '\n\n')
        f.write('\u3000\u3000' + body + '\n\n')
    # Message reads "<title> downloaded successfully".
    print(name + "下载成功")

# Entry point of the crawler: start from the page under /lyd40275/ on baseUrl.
URL = baseUrl+'/lyd40275/'
# getUrl must run before any openUrl call: besides returning the chapter
# URLs, it sets the global `filepath` that getText appends each chapter to.
urlList = getUrl(URL)
# Download the chapters one at a time, in the order getUrl returned them.
for url in urlList:
    openUrl(url)
